{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from bs4 import BeautifulSoup\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "stemmer = WordNetLemmatizer()\n",
    "from scipy import sparse\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import StrMethodFormatter\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt\n",
    "from keras.models import Sequential, load_model\n",
    "from keras.layers import Dense, Dropout\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_text(review, remove_stopwords=False, Lem=False):\n",
    "    review_text = BeautifulSoup(review, \"html.parser\").get_text()\n",
    "    review_text = re.sub('[^a-zA-Z]',' ', review_text)\n",
    "    review_text = re.sub('\\s+',' ', review_text)\n",
    "    words = review_text.lower().split()\n",
    "    if remove_stopwords:\n",
    "        stops = set(stopwords.words(\"english\"))\n",
    "        words = [w for w in words if not w in stops]\n",
    "    if Lem:\n",
    "        words = [stemmer.lemmatize(w) for w in words] # Lemmatization\n",
    "    review_text = (' '.join([word for word in words]))\n",
    "    return(review_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Train01.csv', encoding='ISO-8859-1')\n",
    "test_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Test01.csv', encoding='ISO-8859-1')\n",
    "FeatureNames = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\FeatureNames.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "test_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]\n",
    "test_other_models = test_other_models[['id','Price_Log_Pred']]\n",
    "\n",
    "train = pd.merge(train_orig, train_other_models, on='id')\n",
    "test = pd.merge(test_orig, test_other_models, on='id')\n",
    "\n",
    "train['Price_Log'] = np.log10(train['Price']+1)\n",
    "train.hist(column='Price_Log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FeatureNames = FeatureNames['x'].values.tolist()\n",
    "FeatureNames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[FeatureNames] = test[FeatureNames] / train[FeatureNames].max()\n",
    "test[FeatureNames].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[FeatureNames] = train[FeatureNames] / train[FeatureNames].max()\n",
    "train[FeatureNames].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Synopsis2'] = train['Synopsis'].apply(lambda x: cleaning_text(x,True,False))\n",
    "test['Synopsis2'] = test['Synopsis'].apply(lambda x: cleaning_text(x,True,False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "vectorizer = CountVectorizer(max_features=2000, min_df=3, max_df=0.7, ngram_range=(1,3))\n",
    "vectorizer.fit(train['Synopsis2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_list = list(train.FOLD_NUM.unique())\n",
    "fold_list.sort()\n",
    "fold_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def nn_model(input_shape):\n",
    "    model = Sequential()\n",
    "    model.add(Dense(200, input_dim=input_shape, activation='relu'))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dropout(0.4))\n",
    "    model.add(Dense(100, input_dim=input_shape, activation='relu'))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dropout(0.2))\n",
    "    model.add(Dense(50, input_dim=input_shape, activation='relu'))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dropout(0.1))\n",
    "    model.add(Dense(1))\n",
    "    return(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras01_Models = []\n",
    "IterationNum = 1\n",
    "batch_size = 64\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = vectorizer.transform(temp_train['Synopsis2'])\n",
    "    temp_val_tf = vectorizer.transform(temp_val['Synopsis2'])\n",
    "    \n",
    "    temp_train_tf = sparse.hstack((temp_train_tf,sparse.csr_matrix(np.asmatrix(temp_train[FeatureNames].values))))\n",
    "    temp_val_tf = sparse.hstack((temp_val_tf,sparse.csr_matrix(np.asmatrix(temp_val[FeatureNames].values))))\n",
    "    \n",
    "    temp_train_tf = temp_train_tf.toarray()\n",
    "    temp_val_tf = temp_val_tf.toarray()\n",
    "    \n",
    "    model = nn_model(temp_train_tf.shape[1])\n",
    "    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])\n",
    "    print(model.summary())\n",
    "    \n",
    "    model_weights_save_path = 'C:/Kaggle/BooksPrice/KerasModels/'\n",
    "    file_name = \"201901001_Keras02_Model_Weights_Fold_\"+str(fold_num)+'.h5'\n",
    "    final_path = model_weights_save_path+file_name\n",
    "    print(\"Model Weights File Name : \",final_path)\n",
    "    keras01_Models.append(final_path)\n",
    "    \n",
    "    es = EarlyStopping(mode='min',\n",
    "                       verbose=1,\n",
    "                       patience=10)\n",
    "    \n",
    "    checkpointer = ModelCheckpoint(filepath=final_path,\n",
    "                                   mode='min',\n",
    "                                   verbose=1,\n",
    "                                   save_best_only=True)\n",
    "    \n",
    "    history = model.fit(temp_train_tf, temp_train['Price_Log'].values,\n",
    "                        epochs = 500,\n",
    "                        batch_size = batch_size,\n",
    "                        verbose = 1,\n",
    "                        shuffle = 'batch',\n",
    "                        validation_data = (temp_val_tf, temp_val['Price_Log'].values),\n",
    "                        callbacks = [es,checkpointer])\n",
    "    \n",
    "    \n",
    "    loaded_model = load_model(final_path)\n",
    "    Y_temp_val_pred = loaded_model.predict(temp_val_tf)\n",
    "\n",
    "    temp_val['Price_Log_Pred_Keras'] = Y_temp_val_pred\n",
    "\n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(keras01_Models)\n",
    "print(\"Keras 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))\n",
    "# LB SCORE : 0.7716"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CV_SCORED_DATA.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\201901001_Keras02_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_tf = vectorizer.transform(test['Synopsis2'])\n",
    "test_tf = sparse.hstack((test_tf,sparse.csr_matrix(np.asmatrix(test[FeatureNames].values))))\n",
    "print(test_tf.shape)\n",
    "\n",
    "test_tf = test_tf.toarray()\n",
    "\n",
    "test_preds = np.zeros((test.shape[0],1))\n",
    "\n",
    "for fname in keras01_Models:\n",
    "    print(\"Running for : \",fname)\n",
    "    loaded_model = load_model(fname)\n",
    "    test_preds = test_preds + loaded_model.predict(test_tf)\n",
    "    \n",
    "test_preds = test_preds / len(keras01_Models)\n",
    "\n",
    "test['Price_Log_Pred_Keras'] = test_preds\n",
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\201901001_Keras02_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['Price_Log_Pred_Keras'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_preds2 = (10**test_preds) - 1\n",
    "pd.DataFrame(test_preds2).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission['Price'] = test_preds2\n",
    "submission.to_excel('C:\\\\Kaggle\\\\BooksPrice\\\\Submissions\\\\201901001_Keras02_DS.xlsx', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
